圖像辨識與卷積神經網路

林嶔 (Lin, Chin)

Lesson 6

圖像辨識基礎(1)

F6_1

– 我們已經利用了多層感知機解決了眾多問題,那我們是否能在圖像方面也能使用多層感知機找到一個能夠精準進行手寫數字辨識的預測函數?

圖像辨識基礎(2)

– 請在這裡下載MNIST的手寫數字資料,並讓我們了解一下這筆資料的結構

– 一個28×28的黑白圖片其實可以被表示成784個介於0至255的數字,這樣我們就又能把問題轉換為單純的預測問題了。

library(data.table)

# Read the MNIST csv as a plain data.frame, then coerce everything to a
# numeric matrix (column 1 = digit label, columns 2..785 = pixel values).
DAT = fread("data/MNIST.csv", data.table = FALSE)
DAT = data.matrix(DAT)

# Split data: 60% training / 40% testing ----

set.seed(0)
Train.sample = sample(1:nrow(DAT), nrow(DAT)*0.6, replace = FALSE)

# Label is in column 1; the remaining 784 columns are the 28x28 pixels.
Train.X = DAT[Train.sample, -1]
Train.Y = DAT[Train.sample, 1]
Test.X  = DAT[-Train.sample, -1]
Test.Y  = DAT[-Train.sample, 1]

# Display the first training digit ----

library(OpenImageR)

first_digit = matrix(as.numeric(Train.X[1, ]), nrow = 28, byrow = TRUE)
imageShow(first_digit)

圖像辨識基礎(3)

– 這個時候我們會面對到一個硬體的問題,那就是我們不可能預先把所有檔案都讀到RAM內。比較好的解決方法是每次訓練時只讀取小批量的訓練樣本,這樣就能有效降低記憶體的使用。

# Write the train/test splits back to disk (label first, then 784 pixels,
# no header) so a batch iterator can stream them instead of holding the
# whole dataset in RAM.

fwrite(x = data.table(cbind(Train.Y, Train.X)),
       file = 'data/train_data.csv',
       col.names = FALSE, row.names = FALSE)

fwrite(x = data.table(cbind(Test.Y, Test.X)),
       file = 'data/test_data.csv',
       col.names = FALSE, row.names = FALSE)
library(mxnet)

# Custom batch iterator for the MLP: wraps mx.io.CSVIter and reshapes each
# CSV batch into (data = 784 x batch, label = 10 x batch one-hot) NDArrays.
my_iterator_func <- setRefClass("Custom_Iter",
                                fields = c("iter", "data.csv", "data.shape", "batch.size"),
                                contains = "Rcpp_MXArrayDataIter",
                                methods = list(
                                  initialize = function(iter, data.csv, data.shape, batch.size){
                                    # Delegate the actual file streaming to the built-in CSV iterator.
                                    csv_iter <- mx.io.CSVIter(data.csv = data.csv, data.shape = data.shape, batch.size = batch.size)
                                    .self$iter <- csv_iter
                                    .self
                                  },
                                  value = function(){
                                    # Row 1 of each batch is the label; rows 2..785 are the pixels.
                                    val <- as.array(.self$iter$value()$data)
                                    val.x <- val[-1,]
                                    # One-hot encode the digit labels (classes 0-9).
                                    val.y <- t(model.matrix(~ -1 + factor(val[1,], levels = 0:9)))
                                    # NOTE(review): this reshape is a no-op (val.x already has these
                                    # dims); kept for symmetry with the CNN iterator further below.
                                    dim(val.x) <- c(nrow(val.x), ncol(val.x))
                                    val.y <- array(val.y, dim = c(10, ncol(val.x)))
                                    val.x <- mx.nd.array(val.x)
                                    val.y <- mx.nd.array(val.y)
                                    list(data=val.x, label=val.y)
                                  },
                                  iter.next = function(){
                                    .self$iter$iter.next()
                                  },
                                  reset = function(){
                                    .self$iter$reset()
                                  },
                                  finalize=function(){
                                  }
                                )
)

# Instantiate the iterator (785 = 1 label column + 784 pixels per row)
# and pull one batch to sanity-check the data/label shapes.
my_iter = my_iterator_func(iter = NULL,  data.csv = 'data/train_data.csv', data.shape = 785, batch.size = 20)
my_iter$reset()
my_iter$iter.next()
## [1] TRUE
my_value = my_iter$value()

# First sample of the batch, reshaped back to a 28x28 image.
imageShow(matrix(as.numeric(as.array(my_value$data)[,1]), nrow = 28, byrow = TRUE))

# Its one-hot label (here: digit 8).
print(as.array(my_value$label)[,1])
##  [1] 0 0 0 0 0 0 0 0 1 0

圖像辨識基礎(4)

# Train an mxnet model with a hand-rolled executor loop.
#
# Args:
#   Iterator    : batch iterator exposing reset()/iter.next()/value();
#                 value() must return list(data = ..., label = ...) whose
#                 names match the loss symbol's input variables.
#   ctx         : device context the executor is bound to.
#   save.grad   : if TRUE, track the mean absolute gradient of every
#                 '*weight' array per epoch and plot them on a log scale
#                 (useful for diagnosing vanishing/exploding gradients).
#   loss_symbol : symbol whose first output is the training loss.
#   pred_symbol : symbol used to assemble the returned prediction model.
#   Optimizer   : optimizer object created by mx.opt.create().
#   num_round   : number of training epochs.
#
# Returns: an mxnet model (pred_symbol + trained weights) usable with predict().
my.model.FeedForward.create = function (Iterator, ctx = mx.cpu(), save.grad = FALSE,
                                        loss_symbol, pred_symbol,
                                        Optimizer, num_round = 30) {
  
  require(abind)
  
  #0. Check data shape: peek at one batch to size the executor.
  Iterator$reset()
  Iterator$iter.next()
  my_values <- Iterator$value()
  input_shape <- lapply(my_values, dim)
  batch_size <- tail(input_shape[[1]], 1)   # last dim of 'data' is the batch size
  
  #1. Build an executor to train model
  exec_list = list(symbol = loss_symbol, ctx = ctx, grad.req = "write")
  exec_list = append(exec_list, input_shape)
  my_executor = do.call(mx.simple.bind, exec_list)
  
  #2. Set the initial parameters
  mx.set.seed(0)
  new_arg = mxnet:::mx.model.init.params(symbol = loss_symbol,
                                         input.shape = input_shape,
                                         output.shape = NULL,
                                         initializer = mxnet:::mx.init.uniform(0.01),
                                         ctx = ctx)
  mx.exec.update.arg.arrays(my_executor, new_arg$arg.params, match.name = TRUE)
  mx.exec.update.aux.arrays(my_executor, new_arg$aux.params, match.name = TRUE)
  
  #3. Define the updater
  my_updater = mx.opt.get.updater(optimizer = Optimizer, weights = my_executor$ref.arg.arrays)
  
  #4. Forward/Backward
  message('Start training:')
  
  set.seed(0)
  if (save.grad) {epoch_grad = NULL}
  
  for (i in 1:num_round) {
    
    Iterator$reset()
    batch_loss = list()
    if (save.grad) {batch_grad = list()}
    batch_seq = 0
    t0 = Sys.time()
    
    while (Iterator$iter.next()) {
      
      # Feed the batch, run forward/backward, and apply the optimizer step.
      my_values <- Iterator$value()
      mx.exec.update.arg.arrays(my_executor, arg.arrays = my_values, match.name = TRUE)
      mx.exec.forward(my_executor, is.train = TRUE)
      mx.exec.backward(my_executor)
      update_args = my_updater(weight = my_executor$ref.arg.arrays, grad = my_executor$ref.grad.arrays)
      mx.exec.update.arg.arrays(my_executor, update_args, skip.null = TRUE)
      batch_loss[[length(batch_loss) + 1]] = as.array(my_executor$ref.outputs[[1]])
      if (save.grad) {
        # Mean absolute gradient per parameter array; keep '*weight' only.
        grad_list = sapply(my_executor$ref.grad.arrays, function (x) {if (!is.null(x)) {mean(abs(as.array(x)))}})
        grad_list = unlist(grad_list[grepl('weight', names(grad_list), fixed = TRUE)])
        batch_grad[[length(batch_grad) + 1]] = grad_list
      }
      batch_seq = batch_seq + 1
      
    }
    
    # formatC's second positional argument is 'width', not 'digits', so the
    # precision (4 and 2 decimal places) must be passed by name.
    message(paste0("epoch = ", i,
                   ": loss = ", formatC(mean(unlist(batch_loss)), format = "f", digits = 4),
                   " (Speed: ", formatC(batch_seq * batch_size/as.numeric(Sys.time() - t0, units = 'secs'), format = "f", digits = 2), " sample/secs)"))
    
    if (save.grad) {epoch_grad = rbind(epoch_grad, apply(abind(batch_grad, along = 2), 1, mean))}
    
  }
  
  if (save.grad) {
    
    # Floor tiny gradients so the log-scale plot stays finite.
    epoch_grad[epoch_grad < 1e-8] = 1e-8
    
    COL = rainbow(ncol(epoch_grad))
    # Small random vertical jitter so overlapping curves remain visible.
    random_pos = 2^runif(ncol(epoch_grad), -0.5, 0.5)
    
    plot(epoch_grad[,1] * random_pos[1], type = 'l', col = COL[1],
         xlab = 'epoch', ylab = 'mean of abs(grad)', log = 'y',
         ylim = range(epoch_grad))
    
    for (i in 2:ncol(epoch_grad)) {lines(1:nrow(epoch_grad), epoch_grad[,i] * random_pos[i], col = COL[i])}
    
    legend('topright', paste0('layer', 1:ncol(epoch_grad), '_weight'), col = COL, lwd = 1)
    
  }
  
  #5. Get model: rebind the trained weights to the prediction symbol.
  my_model <- mxnet:::mx.model.extract.model(symbol = pred_symbol,
                                             train.execs = list(my_executor))
  
  return(my_model)
  
}

圖像辨識基礎(5)

– 我們先試試看5隱藏層網路,這個網路在昨天的IRIS資料集中是不work的,現在呢?

– 這是Model architecture:

# Model architecture: a 5-hidden-layer MLP (50 ReLU units per layer)
# ending in a 10-way softmax over the digit classes.
data = mx.symbol.Variable(name = 'data')
label = mx.symbol.Variable(name = 'label')
fc1 = mx.symbol.FullyConnected(data = data, num.hidden = 50, name = 'fc1')
relu1 = mx.symbol.Activation(data = fc1, act.type = 'relu', name = 'relu1')
fc2 = mx.symbol.FullyConnected(data = relu1, num.hidden = 50, name = 'fc2')
relu2 = mx.symbol.Activation(data = fc2, act.type = 'relu', name = 'relu2')
fc3 = mx.symbol.FullyConnected(data = relu2, num.hidden = 50, name = 'fc3')
relu3 = mx.symbol.Activation(data = fc3, act.type = 'relu', name = 'relu3')
fc4 = mx.symbol.FullyConnected(data = relu3, num.hidden = 50, name = 'fc4')
relu4 = mx.symbol.Activation(data = fc4, act.type = 'relu', name = 'relu4')
fc5 = mx.symbol.FullyConnected(data = relu4, num.hidden = 50, name = 'fc5')
relu5 = mx.symbol.Activation(data = fc5, act.type = 'relu', name = 'relu5')
fc6 = mx.symbol.FullyConnected(data = relu5, num.hidden = 10, name = 'fc6')
softmax_layer = mx.symbol.softmax(data = fc6, axis = 1, name = 'softmax_layer')

# Multinomial log-loss; eps keeps log() away from exactly zero.
eps = 1e-8
m_log = 0 - mx.symbol.mean(mx.symbol.broadcast_mul(mx.symbol.log(softmax_layer + eps), label))
m_logloss = mx.symbol.MakeLoss(m_log, name = 'm_logloss')

– 這是Optimizer:

# Adam optimizer with a small L2 weight decay for regularization.
my_optimizer = mx.opt.create(name = "adam", learning.rate = 0.001, beta1 = 0.9, beta2 = 0.999,
                             epsilon = 1e-08, wd = 1e-4)

圖像辨識基礎(6)

# Train the 5-hidden-layer MLP for 30 epochs, recording gradient norms.
model = my.model.FeedForward.create(Iterator = my_iter, ctx = mx.cpu(), save.grad = TRUE,
                                    loss_symbol = m_logloss, pred_symbol = softmax_layer,
                                    Optimizer = my_optimizer, num_round = 30)

# Evaluate on the held-out 40%: rows of the confusion table are predicted
# classes (1-10 = digits 0-9), columns are the true labels.
predict_Y = predict(model, t(Test.X), array.layout = "colmajor")
confusion_table = table(max.col(t(predict_Y)), Test.Y)
cat("Testing accuracy rate =", sum(diag(confusion_table))/sum(confusion_table))
## Testing accuracy rate = 0.9591071
print(confusion_table)
##     Test.Y
##         0    1    2    3    4    5    6    7    8    9
##   1  1645    0   12    2    6   13   11    5   11    9
##   2     0 1834    7    4    2    4   12    5   12    3
##   3     1    3 1559   12    1    0    3   12   12    1
##   4     2    2   30 1652    2   19    1    9   38   15
##   5     0    1    6    0 1548    2    9    5    1   28
##   6     8    1    8   33    1 1488   17    3   29    9
##   7     2    0    2    0    7    5 1603    0    4    0
##   8     0    0   11    8    5    2    0 1682    4   18
##   9     4    8   18   18    3   11    5    3 1550    7
##   10    1    2    3   13   31    7    0   29   14 1552

– 我們發現,5層隱藏層的網路在MNIST數據集中不但work,而且結果還算準(95.9%)

練習1:深淺網路比較

# Total number of trainable parameters in the 5-hidden-layer MLP (~50k).
sum(sapply(lapply(model$arg.params, dim), prod))
## [1] 49960

– 請你設計一個只有1個隱藏層(各層神經元數量為62)的網路並進行準確度測試!

– 你也可以訓練一個再深一點的網路試試看!

練習1答案

# Exercise 1: a single-hidden-layer MLP with 62 units, chosen so the total
# parameter count roughly matches the 5-hidden-layer network above.
data = mx.symbol.Variable(name = 'data')
label = mx.symbol.Variable(name = 'label')
fc1 = mx.symbol.FullyConnected(data = data, num.hidden = 62, name = 'fc1')
relu1 = mx.symbol.Activation(data = fc1, act.type = 'relu', name = 'relu1')
fc2 = mx.symbol.FullyConnected(data = relu1, num.hidden = 10, name = 'fc2')
softmax_layer = mx.symbol.softmax(data = fc2, axis = 1, name = 'softmax_layer')

# Same eps-guarded multinomial log-loss as before.
eps = 1e-8
m_log = 0 - mx.symbol.mean(mx.symbol.broadcast_mul(mx.symbol.log(softmax_layer + eps), label))
m_logloss = mx.symbol.MakeLoss(m_log, name = 'm_logloss')

model = my.model.FeedForward.create(Iterator = my_iter, ctx = mx.cpu(), save.grad = TRUE,
                                    loss_symbol = m_logloss, pred_symbol = softmax_layer,
                                    Optimizer = my_optimizer, num_round = 30)

# The shallower network is noticeably less accurate (~91% vs ~96%).
predict_Y = predict(model, t(Test.X), array.layout = "colmajor")
confusion_table = table(max.col(t(predict_Y)), Test.Y)
cat("Testing accuracy rate =", sum(diag(confusion_table))/sum(confusion_table))
## Testing accuracy rate = 0.9095833

卷積神經網路介紹(1)

– 但回到我們的手寫數字分類問題,當我們看到這些手寫數字時,我們一眼就能認出他們了,但從「圖片」到「概念」的過程真的這麼簡單嗎?

– 現在我們面對的是視覺問題,看來除了模擬大腦思考運作的過程之外,我們還需要模擬眼睛的作用!

卷積神經網路介紹(2)

F6_2

– 他們的研究發現,貓咪在受到不同形狀的圖像刺激時,感受野的腦部細胞會產生不同反應

F6_3

卷積神經網路介紹(3)

– 卷積器模擬了感受野最初的細胞,他們負責用來辨認特定特徵,他們的數學模式如下:

F6_4

– 「特徵圖」的意義是什麼呢?卷積器就像是最初級的視覺細胞,他們專門辨認某一種簡單特徵,那這個「特徵圖」上面數字越大的,就代表那個地方越符合該細胞所負責的特徵。

F6_5

卷積神經網路介紹(4)

F6_6

F6_7

卷積神經網路介紹(5)

– 我們想像有一張人的圖片,假定第一個卷積器是辨認眼睛的特徵,第二個卷積器是在辨認鼻子的特徵,第三個卷積器是在辨認耳朵的特徵,第四個卷積器是在辨認手掌的特徵,第五個卷積器是在辨認手臂的特徵

– 第1.2.3張特徵圖中數值越高的地方,就分別代表眼睛、鼻子、耳朵最有可能在的位置,那將這3張特徵圖合在一起看再一次卷積,是否就能辨認出人臉的位置?

– 第4.5張特徵圖中數值越高的地方,就分別代表手掌、手臂最有可能在的位置,那將這2張特徵圖合在一起看再一次卷積,是否就能辨認出手的位置?

– 第4.5張特徵圖對人臉辨識同樣能起到作用,因為人臉不包含手掌、手臂,因此如果有個卷積器想要辨認人臉,他必須對第1.2.3張特徵圖做正向加權,而對第4.5張特徵圖做負向加權

F6_8

卷積神經網路介紹(6)

– 儘管存在著形狀改變的問題,但卷積器的運算過程可以視為是一種線性運算,因此假若我們能將卷積的過程作一些轉換,那他的梯度與之前的線性轉換的部分很類似。

– 假定\(X\)是一個輸入矩陣,而\(W\)是一個卷積核,而\(O\)是卷積運算之後的輸出,那我們可以把整個過程轉換成類似這樣的形態(下面是一個最簡單的例子,假設輸入是3×3,而卷積核為2×2,輸出則是2×2)

\[\begin{align} O & = Conv(X, W) \\ O' &= X'W' \\\\ X & = \begin{pmatrix} x_{1,1} & x_{1,2} & x_{1,3} \\ x_{2,1} & x_{2,2} & x_{2,3} \\ x_{3,1} & x_{3,2} & x_{3,3} \end{pmatrix} \ \ \ \ \ X' = \begin{pmatrix} x_{1,1} & x_{1,2} & x_{2,1} & x_{2,2} \\ x_{2,1} & x_{2,2} & x_{3,1} & x_{3,2} \\ x_{1,2} & x_{1,3} & x_{2,2} & x_{2,3} \\ x_{2,2} & x_{2,3} & x_{3,2} & x_{3,3} \end{pmatrix} \\\\ W & = \begin{pmatrix} w_{1,1} & w_{1,2} \\ w_{2,1} & w_{2,2} \end{pmatrix} \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ W' = \begin{pmatrix} w_{1,1} \\ w_{1,2} \\ w_{2,1} \\ w_{2,2} \end{pmatrix} \\\\ O & = \begin{pmatrix} o_{1,1} & o_{1,2} \\ o_{2,1} & o_{2,2} \end{pmatrix} \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ O' = \begin{pmatrix} o_{1,1} = X'_{i,1}W' \\ o_{1,2} = X'_{i,2}W' \\ o_{2,1} = X'_{i,3}W' \\ o_{2,2} = X'_{i,4}W' \end{pmatrix} \end{align}\]

– 在這裡我們假設反向傳播的過程中已經取得了輸出\(O\)的梯度\(grad.O\),並從這裡開始往下:

\[\begin{align} \frac{\partial}{\partial W'} O' & = X' \\ \frac{\partial}{\partial X'} O' & = W' \\\\ grad.W' &= \frac{1}{n} \otimes (X')^T \bullet grad.O' \\ grad.X' &= grad.O' \bullet (W')^T \end{align}\]

卷積神經網路介紹(7)

– 我們先看「average pooling」的型態,假定\(X\)是一個輸入矩陣,而\(O\)是池化運算之後的輸出(池化步幅為1×1,池化核為2×2):

\[\begin{align} O & = Pool(X) \\\\ X & = \begin{pmatrix} x_{1,1} & x_{1,2} & x_{1,3} \\ x_{2,1} & x_{2,2} & x_{2,3} \\ x_{3,1} & x_{3,2} & x_{3,3} \end{pmatrix} \\ O & = \begin{pmatrix} o_{1,1} = mean(x_{1,1}, \ x_{1,2}, \ x_{2,1}, \ x_{2,2}) & o_{1,2} = mean(x_{1,2}, \ x_{1,3}, \ x_{2,2}, \ x_{2,3})\\ o_{2,1} = mean(x_{2,1}, \ x_{2,2}, \ x_{3,1}, \ x_{3,2}) & o_{2,2} = mean(x_{2,2}, \ x_{2,3}, \ x_{3,2}, \ x_{3,3}) \end{pmatrix} \end{align}\]

\[\begin{align} grad.O & = \begin{pmatrix} grad.o_{1,1} & grad.o_{1,2} \\ grad.o_{2,1} & grad.o_{2,2} \end{pmatrix} \\ grad.X & = \begin{pmatrix} grad.x_{1,1} = \frac{grad.o_{1,1}}{4} & grad.x_{1,2} = \frac{grad.o_{1,1} + grad.o_{1,2}}{4} & grad.x_{1,3} =\frac{grad.o_{1,2}}{4}\\ grad.x_{2,1} = \frac{grad.o_{1,1} + grad.o_{2,1}}{4} & grad.x_{2,2} = \frac{grad.o_{1,1} + grad.o_{2,1} + grad.o_{1,2} + grad.o_{2,2}}{4} & grad.x_{2,3} = \frac{grad.o_{1,2} + grad.o_{2,2}}{4} \\ grad.x_{3,1} = \frac{grad.o_{2,1}}{4} & grad.x_{3,2} = \frac{grad.o_{2,1} + grad.o_{2,2}}{4} & grad.x_{3,3} = \frac{grad.o_{2,2}}{4} \end{pmatrix} \end{align}\]

卷積神經網路介紹(8)

– 由於不存在數學符號表示最大的元素為何,這裡我們帶入一組真實數字來看看結果

\[\begin{align} O & = Pool(X) \\\\ X & = \begin{pmatrix} x_{1,1} & x_{1,2} & x_{1,3} \\ x_{2,1} & x_{2,2} & x_{2,3} \\ x_{3,1} & x_{3,2} & x_{3,3} \end{pmatrix} \ \ \ \ \ \ \ X = \begin{pmatrix} 5 & 3 & 4 \\ 8 & 1 & 2 \\ 6 & 7 & 9 \end{pmatrix} \\ O & = \begin{pmatrix} o_{1,1} = x_{2,1} & o_{1,2} = x_{1,3} \\ o_{2,1} = x_{2,1} & o_{2,2} = x_{3,3} \end{pmatrix} \end{align}\]

\[\begin{align} grad.O & = \begin{pmatrix} grad.o_{1,1} & grad.o_{1,2} \\ grad.o_{2,1} & grad.o_{2,2} \end{pmatrix} \\ grad.X & = \begin{pmatrix} grad.x_{1,1} = 0 & grad.x_{1,2} = 0 & grad.x_{1,3} = grad.o_{1,2} \\ grad.x_{2,1} = grad.o_{1,1} + grad.o_{2,1} & grad.x_{2,2} = 0 & grad.x_{2,3} = 0 \\ grad.x_{3,1} = 0 & grad.x_{3,2} = 0 & grad.x_{3,3} = grad.o_{2,2} \end{pmatrix} \end{align}\]

練習2:看看卷積器的效果

– 這是一張鸚鵡的圖片

library(imager)

# Load the parrots sample image bundled with imager and convert it to a
# single-channel grayscale image.
# NOTE(review): "gary.img" looks like a typo for "gray.img"; the name is
# used consistently below, so it is left unchanged here.
img <- load.image(system.file("extdata/parrots.png", package="imager"))
gary.img <- grayscale(img)

plot(gary.img)

– 我們試著用一個特殊結構的卷積器取得他的特徵圖吧!

# Edge-detection kernel: each output pixel is 8x the centre minus its
# eight neighbours, so flat regions map to ~0 and edges stand out.
conv.filter.1 = matrix(c(-1, -1, -1,
                         -1, +8, -1,
                         -1, -1, -1), nrow = 3)

img.matrix = as.matrix(gary.img)

# "Valid" convolution output: a 3x3 kernel trims one pixel per border.
feature.img = matrix(NA, nrow = nrow(img.matrix) - 2, ncol = ncol(img.matrix) - 2)

for (r in seq_len(nrow(feature.img))) {
  for (k in seq_len(ncol(feature.img))) {
    patch = img.matrix[r:(r + 2), k:(k + 2)]
    feature.img[r, k] = sum(patch * conv.filter.1)
  }
}

new.img = as.cimg(feature.img)

plot(new.img)

– 請你試著使用其他卷積器來試試看吧?

# A Sobel kernel: responds to horizontal intensity gradients, i.e. it
# highlights vertical edges.
conv.filter.2 = matrix(c(-1, 0, +1,
                         -2, 0, +2,
                         -1, 0, +1), nrow = 3)

練習2答案

# Apply the Sobel kernel with the same sliding-window scheme as before.
feature.img = matrix(NA, nrow = nrow(img.matrix) - 2, ncol = ncol(img.matrix) - 2)

for (r in seq_len(nrow(feature.img))) {
  for (k in seq_len(ncol(feature.img))) {
    patch = img.matrix[r:(r + 2), k:(k + 2)]
    feature.img[r, k] = sum(patch * conv.filter.2)
  }
}

new.img = as.cimg(feature.img)

plot(new.img)

利用卷積神經網路做手寫數字辨識(1)

F6_9

利用卷積神經網路做手寫數字辨識(2)

# Batch iterator for the CNN: identical to the MLP iterator except that
# the data batch is reshaped to 4-D (width, height, channel, batch).
my_iterator_func <- setRefClass("Custom_Iter",
                                fields = c("iter", "data.csv", "data.shape", "batch.size"),
                                contains = "Rcpp_MXArrayDataIter",
                                methods = list(
                                  initialize = function(iter, data.csv, data.shape, batch.size){
                                    # Delegate the actual file streaming to the built-in CSV iterator.
                                    csv_iter <- mx.io.CSVIter(data.csv = data.csv, data.shape = data.shape, batch.size = batch.size)
                                    .self$iter <- csv_iter
                                    .self
                                  },
                                  value = function(){
                                    # Row 1 of each batch is the label; rows 2..785 are the pixels.
                                    val <- as.array(.self$iter$value()$data)
                                    val.x <- val[-1,]
                                    # One-hot encode the digit labels (classes 0-9).
                                    val.y <- t(model.matrix(~ -1 + factor(val[1,], levels = 0:9)))
                                    val.y <- array(val.y, dim = c(10, ncol(val.x)))
                                    # Reshape flat pixels to 28x28x1 images for the conv layers.
                                    dim(val.x) <- c(28, 28, 1, ncol(val.x))
                                    val.x <- mx.nd.array(val.x)
                                    val.y <- mx.nd.array(val.y)
                                    list(data=val.x, label=val.y)
                                  },
                                  iter.next = function(){
                                    .self$iter$iter.next()
                                  },
                                  reset = function(){
                                    .self$iter$reset()
                                  },
                                  finalize=function(){
                                  }
                                )
)

# Sanity-check the CNN iterator: the first sample should display as the
# same digit as in the MLP iterator check above.
my_iter = my_iterator_func(iter = NULL,  data.csv = 'data/train_data.csv', data.shape = 785, batch.size = 20)
my_iter$reset()
my_iter$iter.next()
## [1] TRUE
my_value = my_iter$value()

library(OpenImageR)

imageShow(matrix(as.numeric(as.array(my_value$data)[,,,1]), nrow = 28, byrow = TRUE))

print(as.array(my_value$label)[,1])
##  [1] 0 0 0 0 0 0 0 0 1 0
# Adam optimizer; no weight decay this time.
my_optimizer = mx.opt.create(name = "adam", learning.rate = 0.001, beta1 = 0.9, beta2 = 0.999,
                             epsilon = 1e-08, wd = 0)

利用卷積神經網路做手寫數字辨識(3)

– 這是一個閹割版的LeNet,原版的LeNet其第一、二層的卷積器數量分別是20以及50,而第一個全連接層具有500個神經元。這樣一個閹割版的小網路他的總參數量將與剛剛我們所使用的5隱藏層多層感知器相當。

# Scaled-down LeNet: 10/20 conv filters and a 150-unit hidden layer, so the
# total parameter count matches the 5-hidden-layer MLP used earlier.

# input
data <- mx.symbol.Variable('data')

# first conv block: 10 5x5 filters -> ReLU -> 2x2/2 max-pool (28 -> 24 -> 12)
conv1 <- mx.symbol.Convolution(data=data, kernel=c(5,5), num_filter=10, name = 'conv1')
relu1 <- mx.symbol.Activation(data=conv1, act_type="relu")
pool1 <- mx.symbol.Pooling(data=relu1, pool_type="max",
                          kernel=c(2,2), stride=c(2,2))
# second conv block: 20 5x5 filters -> ReLU -> 2x2/2 max-pool (12 -> 8 -> 4)
conv2 <- mx.symbol.Convolution(data=pool1, kernel=c(5,5), num_filter=20, name = 'conv2')
relu2 <- mx.symbol.Activation(data=conv2, act_type="relu")
pool2 <- mx.symbol.Pooling(data=relu2, pool_type="max",
                          kernel=c(2,2), stride=c(2,2))
# first fully-connected layer: flatten 4x4x20 = 320 features -> 150 units
flatten <- mx.symbol.Flatten(data=pool2)
fc1 <- mx.symbol.FullyConnected(data=flatten, num_hidden=150, name = 'fc1')
relu3 <- mx.symbol.Activation(data=fc1, act_type="relu")

# output layer: 10 classes
fc2 <- mx.symbol.FullyConnected(data=relu3, num_hidden=10, name = 'fc2')

# Softmax over the class axis
lenet <- mx.symbol.softmax(data = fc2, axis = 1, name = 'lenet')

# multinomial log-loss; eps guards log(0)
label = mx.symbol.Variable(name = 'label')

eps = 1e-8
m_log = 0 - mx.symbol.mean(mx.symbol.broadcast_mul(mx.symbol.log(lenet + eps), label))
m_logloss = mx.symbol.MakeLoss(m_log, name = 'm_logloss')

– 第一層卷積組合

  1. 原始圖片(28x28x1)要先經過10個5x5的「卷積器」(5x5x1x10)處理,將使圖片變成10張「一階特徵圖」(24x24x10)

  2. 接著這10張「一階特徵圖」(24x24x10)會經過ReLU,產生10張「轉換後的一階特徵圖」(24x24x10)

  3. 接著這10張「轉換後的一階特徵圖」(24x24x10)再經過2x2「池化器」(2x2)處理,將使圖片變成10張「降維後的一階特徵圖」(12x12x10)

– 第二層卷積組合

  1. 再將10張「降維後的一階特徵圖」(12x12x10)經過20個5x5的「卷積器」(5x5x10x20)處理,將使圖片變成20張「二階特徵圖」(8x8x20)

  2. 接著這20張「二階特徵圖」(8x8x20)會經過ReLU,產生20張「轉換後的二階特徵圖」(8x8x20)

  3. 接著這20張「轉換後的二階特徵圖」(8x8x20)再經過2x2「池化器」(2x2)處理,將使圖片變成20張「降維後的二階特徵圖」(4x4x20)

– 全連接層

  1. 將「降維後的二階特徵圖」(4x4x20)重新排列,壓製成「一階高級特徵」(320)

  2. 讓「一階高級特徵」(320)進入「隱藏層」,輸出「二階高級特徵」(150)

  3. 「二階高級特徵」(150)經過ReLU,輸出「轉換後的二階高級特徵」(150)

  4. 「轉換後的二階高級特徵」(150)進入「輸出層」,產生「原始輸出」(10)

  5. 「原始輸出」(10)經過Softmax函數轉換,判斷圖片是哪個類別

利用卷積神經網路做手寫數字辨識(5)

# Train the scaled-down LeNet with the same custom training loop.
lenet_model = my.model.FeedForward.create(Iterator = my_iter, ctx = mx.cpu(), save.grad = TRUE,
                                          loss_symbol = m_logloss, pred_symbol = lenet,
                                          Optimizer = my_optimizer, num_round = 30)

library(data.table)

# Reload the test split and reshape it to 4-D for the conv layers.
DAT = fread("data/test_data.csv", data.table = FALSE)
DAT = data.matrix(DAT)

Test.X = t(DAT[,-1])
dim(Test.X) = c(28, 28, 1, ncol(Test.X))
Test.Y = DAT[,1]

# Rows of the confusion table = predicted class (1-10 = digits 0-9).
predict_Y = predict(lenet_model, Test.X)
confusion_table = table(max.col(t(predict_Y)), Test.Y)
cat("Testing accuracy rate =", sum(diag(confusion_table))/sum(confusion_table))
## Testing accuracy rate = 0.98
print(confusion_table)
##     Test.Y
##         0    1    2    3    4    5    6    7    8    9
##   1  1657    1    9    1    4    4    8    0    4    6
##   2     2 1825    8    1    1    0    2    3    4    0
##   3     0    5 1621    9    1    0    0    8    5    3
##   4     1    0    4 1684    0    2    0    2    3    0
##   5     0    5    0    0 1565    2    1    5    7    9
##   6     0    2    0   27    0 1527    2    0   15    7
##   7     2    1    3    0    5    8 1648    0   15    0
##   8     0    4    5    4    4    1    0 1708    0    3
##   9     1    4    6   10    2    3    0    4 1618    3
##   10    0    4    0    6   24    4    0   23    4 1611

練習3:重現CNN推理過程

# The trained parameter arrays: conv/fc weights and biases, keyed by the
# layer names given in the symbol definitions.
PARAMS = lenet_model$arg.params
ls(PARAMS)
## [1] "conv1_bias"   "conv1_weight" "conv2_bias"   "conv2_weight"
## [5] "fc1_bias"     "fc1_weight"   "fc2_bias"     "fc2_weight"

– 第一層卷積組合

  1. 原始圖片(28x28x1)要先經過10個5x5的「卷積器」(5x5x1x10)處理,將使圖片變成10張「一階特徵圖」(24x24x10)

  2. 接著這10張「一階特徵圖」(24x24x10)會經過ReLU,產生10張「轉換後的一階特徵圖」(24x24x10)

  3. 接著這10張「轉換後的一階特徵圖」(24x24x10)再經過2x2「池化器」(2x2)處理,將使圖片變成10張「降維後的一階特徵圖」(12x12x10)

– 第二層卷積組合

  1. 再將10張「降維後的一階特徵圖」(12x12x10)經過20個5x5的「卷積器」(5x5x10x20)處理,將使圖片變成20張「二階特徵圖」(8x8x20)

  2. 接著這20張「二階特徵圖」(8x8x20)會經過ReLU,產生20張「轉換後的二階特徵圖」(8x8x20)

  3. 接著這20張「轉換後的二階特徵圖」(8x8x20)再經過2x2「池化器」(2x2)處理,將使圖片變成20張「降維後的二階特徵圖」(4x4x20)

– 全連接層

  1. 將「降維後的二階特徵圖」(4x4x20)重新排列,壓製成「一階高級特徵」(320)

  2. 讓「一階高級特徵」(320)進入「隱藏層」,輸出「二階高級特徵」(150)

  3. 「二階高級特徵」(150)經過ReLU,輸出「轉換後的二階高級特徵」(150)

  4. 「轉換後的二階高級特徵」(150)進入「輸出層」,產生「原始輸出」(10)

  5. 「原始輸出」(10)經過Softmax函數轉換,判斷圖片是哪個類別

# Predict the first test image and display it with the true label (green,
# top-left) and the predicted label (blue, top-right).
Input = Test.X[,,,1]
dim(Input) = c(28, 28, 1, 1)
preds = predict(lenet_model, Input)
pred.label = max.col(t(preds)) - 1   # max.col is 1-based; classes are 0-9

par(mar=rep(0,4))
plot(NA, xlim = 0:1, ylim = 0:1, xaxt = "n", yaxt = "n", bty = "n")
img = as.raster(t(matrix(as.numeric(Input)/255, nrow = 28)))
rasterImage(img, -0.04, -0.04, 1.04, 1.04, interpolate=FALSE)
text(0.05, 0.95, Test.Y[1], col = "green", cex = 2)
text(0.95, 0.95, pred.label, col = "blue", cex = 2)

練習3答案

# Exercise 3: reproduce the CNN's forward pass with plain R loops, using
# the trained parameter arrays, and verify it matches predict()'s output.
PARAMS = lenet_model$arg.params

Input = Test.X[,,,1]
dim(Input) = c(28, 28, 1, 1)

# Conv layer 1: 10 filters of 5x5, valid convolution (28x28 -> 24x24).
Conv1_out = array(0, dim = c(24, 24, 10))

for (i in 1:10) {
  for (j in 1:24) {
    for (k in 1:24) {
      # Dot product of the 5x5 input patch with filter i, plus its bias.
      Conv1_out[j,k,i] <- sum(Input[j+(0:4),k+(0:4),,] * as.array(PARAMS$conv1_weight)[,,,i]) + as.array(PARAMS$conv1_bias)[i]
    } 
  }
}

# ReLU: clamp negatives to zero.
ReLU1_out = Conv1_out
ReLU1_out[ReLU1_out < 0] = 0

# Max-pool 2x2, stride 2 (24x24 -> 12x12).
Pool1_out = array(0, dim = c(12, 12, 10))

for (i in 1:10) {
  for (j in 1:12) {
    for (k in 1:12) {
      Pool1_out[j,k,i] <- max(ReLU1_out[j*2+(-1:0),k*2+(-1:0),i])
    } 
  }
}

# Conv layer 2: 20 filters of 5x5x10, summed over all 10 input channels
# (12x12x10 -> 8x8x20).
Conv2_out = array(0, dim = c(8, 8, 20))

for (i in 1:20) {
  for (j in 1:8) {
    for (k in 1:8) {
      Conv2_out[j,k,i] <- sum(Pool1_out[j+(0:4),k+(0:4),] * as.array(PARAMS$conv2_weight)[,,,i]) + as.array(PARAMS$conv2_bias)[i]
    } 
  }
}

ReLU2_out = Conv2_out
ReLU2_out[ReLU2_out < 0] = 0

# Max-pool 2x2, stride 2 (8x8 -> 4x4).
Pool2_out = array(0, dim = c(4, 4, 20))

for (i in 1:20) {
  for (j in 1:4) {
    for (k in 1:4) {
      Pool2_out[j,k,i] <- max(ReLU2_out[j*2+(-1:0),k*2+(-1:0),i])
    } 
  }
}

# Flatten 4x4x20 = 320 features, then the two fully-connected layers.
Flatten_out = as.numeric(Pool2_out)
fc1_out = Flatten_out %*% as.array(PARAMS$fc1_weight) + as.array(PARAMS$fc1_bias)

ReLU3_out = fc1_out
ReLU3_out[ReLU3_out < 0] = 0

fc2_out = ReLU3_out %*% as.array(PARAMS$fc2_weight) + as.array(PARAMS$fc2_bias)
# Softmax over the 10 logits.
Softmax_out = exp(fc2_out)/sum(exp(fc2_out))

# Hand-rolled forward pass agrees with mxnet's predict().
all.equal(preds, t(Softmax_out))
## [1] TRUE

影像裁減預測技術(1)

– 在第四節課中所提到的幾個技術都相當容易實現(L2正則化、Dropout),而數據擴增是一門大學問,尤其在影像上更是如此。

F6_10

影像裁減預測技術(2)

# Batch iterator with random-crop data augmentation: each 28x28 image is
# cropped to a random 24x24 window (one shared offset per batch).
my_iterator_func <- setRefClass("Custom_Iter",
                                fields = c("iter", "data.csv", "data.shape", "batch.size"),
                                contains = "Rcpp_MXArrayDataIter",
                                methods = list(
                                  initialize = function(iter, data.csv, data.shape, batch.size){
                                    # Delegate the actual file streaming to the built-in CSV iterator.
                                    csv_iter <- mx.io.CSVIter(data.csv = data.csv, data.shape = data.shape, batch.size = batch.size)
                                    .self$iter <- csv_iter
                                    .self
                                  },
                                  value = function(){
                                    # Row 1 of each batch is the label; rows 2..785 are the pixels.
                                    val <- as.array(.self$iter$value()$data)
                                    val.x <- val[-1,]
                                    # One-hot encode the digit labels (classes 0-9).
                                    val.y <- t(model.matrix(~ -1 + factor(val[1,], levels = 0:9)))
                                    val.y <- array(val.y, dim = c(10, ncol(val.x)))
                                    dim(val.x) <- c(28, 28, 1, ncol(val.x))
                                    random_row = sample(0:4, 1) + 1:24   # random 24-row window
                                    random_col = sample(0:4, 1) + 1:24   # random 24-col window
                                    val.x <- val.x[random_row,random_col,,] # cropping
                                    dim(val.x) <- c(24, 24, 1, ncol(val.y)) # restore 4-D shape
                                    val.x <- mx.nd.array(val.x)
                                    val.y <- mx.nd.array(val.y)
                                    list(data=val.x, label=val.y)
                                  },
                                  iter.next = function(){
                                    .self$iter$iter.next()
                                  },
                                  reset = function(){
                                    .self$iter$reset()
                                  },
                                  finalize=function(){
                                  }
                                )
)

# Sanity-check the cropping iterator: samples now come out as 24x24.
my_iter = my_iterator_func(iter = NULL,  data.csv = 'data/train_data.csv', data.shape = 785, batch.size = 20)
my_iter$reset()
my_iter$iter.next()
## [1] TRUE
my_value = my_iter$value()

imageShow(matrix(as.numeric(as.array(my_value$data)[,,,1]), nrow = 24, byrow = TRUE))

print(as.array(my_value$label)[,1])
##  [1] 0 0 0 0 0 0 0 0 1 0

影像裁減預測技術(3)

– 我們這裡先介紹一個最常見的擴增方法:10 Crop-evaluation

– 這個方法的邏輯是取圖像的左上、左下、右上、右下、正中等5個Crop,以及這5個Crop的水平翻轉鏡像,並且將這10個Crop的預測結果做平均求得最終答案

Input = Test.X[,,,5]
dim(Input) = c(28, 28, 1, 1)

# Show the original image plus the five 24x24 crops used by crop-evaluation:
# the four corners and the centre.
crops = list(
  list(rows = 1:28, cols = 1:28, lab = 'original',    col = 'orange'),
  list(rows = 1:24, cols = 1:24, lab = 'topleft',     col = 'green'),
  list(rows = 1:24, cols = 5:28, lab = 'bottomleft',  col = 'green'),
  list(rows = 5:28, cols = 1:24, lab = 'topright',    col = 'green'),
  list(rows = 5:28, cols = 5:28, lab = 'bottomright', col = 'green'),
  list(rows = 3:26, cols = 3:26, lab = 'center',      col = 'green')
)

par(mar = rep(0, 4), mfrow = c(3, 2))

for (crop in crops) {
  plot(NA, xlim = 0:1, ylim = 0:1, xaxt = "n", yaxt = "n", bty = "n")
  pixel_values = as.numeric(Input[crop$rows, crop$cols, , ]) / 255
  img = as.raster(t(matrix(pixel_values, nrow = length(crop$rows))))
  rasterImage(img, -0.02, -0.02, 1.02, 1.02, interpolate = FALSE)
  text(0.5, 0.5, crop$lab, col = crop$col, cex = 2)
}

練習4:進行影像裁減數據擴增的訓練並且寫出預測函數

– Model architecture:

# Same scaled-down LeNet as before; with the cropping iterator the input
# is now 24x24x1, so the flattened feature length shrinks accordingly.

# input
data <- mx.symbol.Variable('data')

# first conv block: 10 5x5 filters -> ReLU -> 2x2/2 max-pool
conv1 <- mx.symbol.Convolution(data=data, kernel=c(5,5), num_filter=10, name = 'conv1')
relu1 <- mx.symbol.Activation(data=conv1, act_type="relu")
pool1 <- mx.symbol.Pooling(data=relu1, pool_type="max",
                          kernel=c(2,2), stride=c(2,2))
# second conv block: 20 5x5 filters -> ReLU -> 2x2/2 max-pool
conv2 <- mx.symbol.Convolution(data=pool1, kernel=c(5,5), num_filter=20, name = 'conv2')
relu2 <- mx.symbol.Activation(data=conv2, act_type="relu")
pool2 <- mx.symbol.Pooling(data=relu2, pool_type="max",
                          kernel=c(2,2), stride=c(2,2))
# first fully-connected layer
flatten <- mx.symbol.Flatten(data=pool2)
fc1 <- mx.symbol.FullyConnected(data=flatten, num_hidden=150, name = 'fc1')
relu3 <- mx.symbol.Activation(data=fc1, act_type="relu")

# output layer: 10 classes
fc2 <- mx.symbol.FullyConnected(data=relu3, num_hidden=10, name = 'fc2')

# Softmax over the class axis
lenet <- mx.symbol.softmax(data = fc2, axis = 1, name = 'lenet')

# multinomial log-loss; eps guards log(0)
label = mx.symbol.Variable(name = 'label')

eps = 1e-8
m_log = 0 - mx.symbol.mean(mx.symbol.broadcast_mul(mx.symbol.log(lenet + eps), label))
m_logloss = mx.symbol.MakeLoss(m_log, name = 'm_logloss')
my_optimizer = mx.opt.create(name = "adam", learning.rate = 0.001, beta1 = 0.9, beta2 = 0.999,
                             epsilon = 1e-08, wd = 0)
# Train with random-crop augmentation supplied by the iterator.
lenet_model = my.model.FeedForward.create(Iterator = my_iter, ctx = mx.cpu(), save.grad = TRUE,
                                          loss_symbol = m_logloss, pred_symbol = lenet,
                                          Optimizer = my_optimizer, num_round = 30)

– 請你寫出預測函數,並且看看準確度如何!

練習4答案(1)

library(data.table)

# Reload the test set and build the five fixed 24x24 crops (four corners
# plus centre) expected by the crop-trained model.
DAT = fread("data/test_data.csv", data.table = FALSE)
DAT = data.matrix(DAT)

Test.X = t(DAT[,-1])
dim(Test.X) = c(28, 28, 1, ncol(Test.X))
Test.X1 = Test.X[1:24,1:24,,]
dim(Test.X1) = c(24, 24, 1, dim(Test.X)[4])
Test.X2 = Test.X[5:28,1:24,,]
dim(Test.X2) = c(24, 24, 1, dim(Test.X)[4])
Test.X3 = Test.X[1:24,5:28,,]
dim(Test.X3) = c(24, 24, 1, dim(Test.X)[4])
Test.X4 = Test.X[5:28,5:28,,]
dim(Test.X4) = c(24, 24, 1, dim(Test.X)[4])
Test.X5 = Test.X[3:26,3:26,,]
dim(Test.X5) = c(24, 24, 1, dim(Test.X)[4])

Test.Y = DAT[,1]

# Predict each crop separately, then average the class probabilities.
predict_Y1 = predict(lenet_model, Test.X1)
predict_Y2 = predict(lenet_model, Test.X2)
predict_Y3 = predict(lenet_model, Test.X3)
predict_Y4 = predict(lenet_model, Test.X4)
predict_Y5 = predict(lenet_model, Test.X5)

predict_Y = (predict_Y1 + predict_Y2 + predict_Y3 + predict_Y4 + predict_Y5) / 5

confusion_table = table(max.col(t(predict_Y)), Test.Y)
cat("Testing accuracy rate =", sum(diag(confusion_table))/sum(confusion_table))
## Testing accuracy rate = 0.9870833
print(confusion_table)
##     Test.Y
##         0    1    2    3    4    5    6    7    8    9
##   1  1660    0    4    0    1    1    5    0    5    5
##   2     1 1840    3    0    5    0    3    4    4    1
##   3     0    4 1629    1    0    0    2    4    3    0
##   4     0    0    8 1726    0    2    0    0    1    2
##   5     0    3    0    0 1571    0    4    3    0    4
##   6     0    0    0    7    0 1542    8    0   16    7
##   7     1    0    0    0    1    3 1638    0    8    0
##   8     0    3    9    0    2    1    0 1732    0    7
##   9     0    1    3    6    1    1    1    1 1630    1
##   10    1    0    0    2   25    1    0    9    8 1615

練習4答案(2)

# Crop-averaged prediction (5 crops, no mirroring): averages the model's
# softmax output over the four corner crops and the centre crop.
#
# Args:
#   X          : test images, dim = c(28, 28, 1, n).
#   model      : trained mxnet model whose symbol takes 24x24x1 input.
#   ctx        : device context the prediction executor is bound to.
#   batch_size : number of images bound to the executor at once.
#
# Returns: a 10 x n matrix of averaged class probabilities.
my_predict = function (X, model, ctx = mx.cpu(), batch_size = 100) {
  
  require(abind)
  
  Pred_executor = mx.simple.bind(symbol = model$symbol, data = c(24, 24, 1, batch_size), ctx = ctx)
  
  # Load the trained weights once up front; they never change between
  # batches, so re-uploading them per crop (as before) was redundant.
  mx.exec.update.arg.arrays(Pred_executor, model$arg.params, match.name = TRUE)
  mx.exec.update.aux.arrays(Pred_executor, model$aux.params, match.name = TRUE)
  
  len_X = ceiling(dim(X)[4]/batch_size)
  
  pred_result_list = list()
  
  for (i in 1:len_X) {  
    
    # Pad the final (possibly short) batch by recycling sample 1; the
    # padded columns are dropped when the result is trimmed below.
    idx = (i-1) * batch_size + 1:batch_size
    idx[idx > dim(X)[4]] = 1
    
    sub_X_list = list()
    sub_X_list[[1]] = X[1:24,1:24,,idx]
    sub_X_list[[2]] = X[5:28,1:24,,idx]
    sub_X_list[[3]] = X[1:24,5:28,,idx]
    sub_X_list[[4]] = X[5:28,5:28,,idx]
    sub_X_list[[5]] = X[3:26,3:26,,idx]
    
    pred_result = 0
    
    for (j in 1:5) {
      
      dim(sub_X_list[[j]]) = c(24, 24, 1, batch_size)
      
      mx.exec.update.arg.arrays(Pred_executor, list(data = mx.nd.array(sub_X_list[[j]])), match.name = TRUE)
      # is.train = FALSE: this is inference, so run the forward pass in
      # prediction mode (no gradient bookkeeping, train-only behavior off).
      mx.exec.forward(Pred_executor, is.train = FALSE)
      
      # Equal-weight average over the 5 crops.
      pred_result = pred_result + as.array(Pred_executor$ref.outputs[[1]])/5
      
    }
    
    pred_result_list[[i]] = pred_result
    
  }
  
  final_pred = abind(pred_result_list, along = 2)
  
  # Trim any padded columns from the last batch.
  return(final_pred[,1:dim(X)[4]])
  
}
# Crop-averaged predictions for the whole test set; matches the manual
# 5-crop result above (accuracy 0.9870833).
predict_Y = my_predict(X = Test.X, model = lenet_model, ctx = mx.cpu(), batch_size = 100)

confusion_table = table(max.col(t(predict_Y)), Test.Y)
cat("Testing accuracy rate =", sum(diag(confusion_table))/sum(confusion_table))
## Testing accuracy rate = 0.9870833
print(confusion_table)
##     Test.Y
##         0    1    2    3    4    5    6    7    8    9
##   1  1660    0    4    0    1    1    5    0    5    5
##   2     1 1840    3    0    5    0    3    4    4    1
##   3     0    4 1629    1    0    0    2    4    3    0
##   4     0    0    8 1726    0    2    0    0    1    2
##   5     0    3    0    0 1571    0    4    3    0    4
##   6     0    0    0    7    0 1542    8    0   16    7
##   7     1    0    0    0    1    3 1638    0    8    0
##   8     0    3    9    0    2    1    0 1732    0    7
##   9     0    1    3    6    1    1    1    1 1630    1
##   10    1    0    0    2   25    1    0    9    8 1615

結語

  1. 他會考慮各像素之間真實的相關性,而非像多層感知器一樣把每個像素視為完全獨立的特徵。這一點可以參考人類的視覺系統,我想你應該能認同你的眼睛具有平移不變性(shift invariance)的特色,因此CNN因為完美的模仿了視覺系統造就了在測試集中的高準確性。

  2. CNN由於權重共享的特性,導致相當的節省參數量。從過擬合的角度思考,這暗示著我們可以用較小的參數量完成複雜的網路,因此較不容易過擬合;從參數量固定的狀況下思考,CNN可以拼湊出較為複雜的結構。因此CNN的結構在影像辨識上具有相當強大的優勢!

F6_11